library(MASS)
library(parallel)
library(fastglm)
library(dplyr)
library(vctrs)
library(data.table)
## Project:     Conditional rank-rank regression
## Task:        Functions for CRRR
## Author:      Jonas Meier
## Date:        1.10.2024


# Main Functionality ------------------------------------------------------

# Description: crrr provides point estimates on the conditional rank-rank 
# ression using (i) uniform and (ii) normal ranks as well as the conventional
# rank-rank regression. inf.crrr additionally provides standard errors on these
# estimates. As a byproduct, the coefficients on the univariate distribution
# regressions are provided too.

# Arguments:
# f1,f2:        Formulas for the univariate DRs
# data:         Data containing all variables used in f1 and f2
# quantiles:    Sequence of quantiles used for the DRs
# link:         Link function used for the DRs ("probit" or "logit")
# parallel:     Boolean, whether computation should be parallelized
# cores:        Integer, number of cores that should be used for parallelization.
#               If all should be used: "ALL"
# coef_interp:  Boolean, whether coefficients on the DRs should be interpolated
# rr_subs:      Boolean of length nrow(data) for which subsets of the data the
#               rank-rank regression should be performed
# restricted:   Boolean, whether the estimator should be restricted (no 
#               intercept in the rank-rank regressions)
# fully_restricted: Boolean, whether the fully restricted estimator should be 
#               computed (12*mean(rank1*rank2))
# cor_based:    Boolean, whether the correlation based estimator should be computed
# reverse:      Boolean, whether the reverse estimator should be computed 
#               (ranks are interchanged)
# rank_out:     Boolean, whether the estimated ranks should be part of the output

# Additional arguments for inf.crrr
# alpha:        Level of statistical significance
# b.boot:       Number of bootstrap replications
# s.size:       Size of bootstrap subsamples as fraction of total sample. Default
#               is 1, implying subsamples have the same size as original sample.
# ranks_pe:     Point estimates on ranks. Can be provided to speed up computation
# ranks_bs:     Bootstrapped ranks. Can be provided to speed up computation


crrr            <- function(f1,f2,data,quantiles=seq(from=.05, to=.95, by=.9/(20-1)),link="probit",parallel=T,cores="ALL",coef_interp=TRUE,rr_subs=NULL,
                            restricted=FALSE,fully_restricted=FALSE,cor_based=FALSE,reverse=FALSE,rank_out=FALSE){
  # Preparation -------------------------------------------------------------
  # Regressors and Outcomes
  data    <- na.omit(data)
  X1      <- model.matrix(f1, data=data)
  X2      <- model.matrix(f2, data=data)
  y1      <- as.matrix(data[,paste(f1)[2]])
  y2      <- as.matrix(data[,paste(f2)[2]])
  
  # Grid
  #q1      <- quantile(as.matrix(y1), quantiles) # quantile (ventile) values for y1
  #q1l     <- as.list(as.data.frame(t(q1)))
  #q2      <- quantile(as.matrix(y2), quantiles) # quantile (ventile) values for y2
  #q2l     <- as.list(as.data.frame(t(q2)))
  
  # Unconditional Ranks
  #ur1 <- rank(y1)/nrow(data)
  #ur2 <- rank(y2)/nrow(data)
  
  
  # Y VARIABLES ALREADY RANKED
  q1      <- seq(from=.05, to=.95, by=.9/(20-1))*100
  q1l     <- as.list(as.data.frame(t(q1)))
  q2      <- seq(from=.05, to=.95, by=.9/(20-1))*100
  q2l     <- as.list(as.data.frame(t(q2)))
  
  # Unconditional Ranks
  ur1 <- y1
  ur2 <- y2
  
  
  # Estimation Ranks --------------------------------------------------------
  
  # Distribution Regression
  if(parallel==F){
    
    # Y1 on X1
    coef1   <- lapply(q1l,dr_na,y=y1,reg=X1,link=link)
    coef1   <- do.call(rbind, coef1)
    
    # Y2 on X2
    coef2   <- lapply(q2l,dr_na,y=y2,reg=X2,link=link)
    coef2   <- do.call(rbind, coef2)
    
  }else{
    
    # Y1 on X1
    if(cores=="ALL"){
      n_clus  <- detectCores()
      cl      <- makeCluster(n_clus)
    }else{
      cl      <- makeCluster(cores)
    }
    clusterEvalQ(cl,{library(fastglm)})
    clusterExport(cl,c("dr_na"))
    coef1    <- clusterApply(cl,as.matrix(q1), function(t,y,reg,link,method) dr_na(t,y,reg,link,method), y=y1, reg=X1, link=link, method=2)
    stopCluster(cl)
    closeAllConnections()
    coef1 <- do.call(rbind, coef1)
    
    # Y2 on X2
    if(cores=="ALL"){
      n_clus  <- detectCores()
      cl      <- makeCluster(n_clus)
    }else{
      cl      <- makeCluster(cores)
    }
    clusterEvalQ(cl,{library(fastglm)})
    clusterExport(cl,c("dr_na"))
    coef2    <- clusterApply(cl,as.matrix(q2), function(t,y,reg,link,method) dr_na(t,y,reg,link,method), y=y2, reg=X2, link=link, method=2)
    stopCluster(cl)
    closeAllConnections()
    coef2 <- do.call(rbind, coef2)
  }
  
  # Interpolation of coefficients to length of data
  if(coef_interp==TRUE){
    quant_n <- seq(min(quantiles), max(quantiles), (max(quantiles) - min(quantiles))/(nrow(data)-1) )
    q1_n <- quantile(as.matrix(y1), quant_n)
    q2_n <- quantile(as.matrix(y2), quant_n)
    int_coef1 <- apply(coef1, 2, approx, x=q1, xout=q1_n)
    int_coef1 <- sapply(int_coef1, "[[", 2)
    int_coef2 <- apply(coef2, 2, approx, x=q2, xout=q2_n)
    int_coef2 <- sapply(int_coef2, "[[", 2)
    
    # Location of observation
    if(parallel==F){
      loc1 <- apply(as.matrix(y1), 1, u_loc, q=q1_n)
      loc2 <- apply(as.matrix(y2), 1, u_loc, q=q2_n)
    }else{
      n_clus  <- detectCores()
      cl      <- makeCluster(n_clus)
      loc1    <- parApply(cl, as.matrix(y1), 1, u_loc, q=q1_n)
      stopCluster(cl)
      closeAllConnections()
      
      n_clus  <- detectCores()
      cl      <- makeCluster(n_clus)
      loc2    <- parApply(cl, as.matrix(y2), 1, u_loc, q=q2_n)
      stopCluster(cl)
      closeAllConnections()
    }
    
    # Sort regressors and coefs for apply
    reg_coef_i1 <- vector("list", nrow(X1))
    reg_coef_i2 <- vector("list", nrow(X2))
    for(i in 1:nrow(X1)){
      reg_coef_i1[[i]] <- rbind(X1[i,], int_coef1[loc1[i], ])
      reg_coef_i2[[i]] <- rbind(X2[i,], int_coef2[loc2[i], ])
    }
    
  }else{
    
    # Location of observation
    if(parallel==F){
      loc1 <- apply(as.matrix(y1), 1, u_loc, q=q1)
      loc2 <- apply(as.matrix(y2), 1, u_loc, q=q2)
    }else{
      n_clus  <- detectCores()
      cl      <- makeCluster(n_clus)
      loc1 <- parApply(cl, as.matrix(y1), 1, u_loc, q=q1)
      stopCluster(cl)
      closeAllConnections()
      
      n_clus  <- detectCores()
      cl      <- makeCluster(n_clus)
      loc2 <- parApply(cl, as.matrix(y2), 1, u_loc, q=q2)
      stopCluster(cl)
      closeAllConnections()
    }
    
    # Sort regressors and coefs for apply
    reg_coef_i1 <- vector("list", nrow(X1))
    reg_coef_i2 <- vector("list", nrow(X2))
    for(i in 1:nrow(X1)){
      reg_coef_i1[[i]] <- rbind(X1[i,], coef1[loc1[i], ])
      reg_coef_i2[[i]] <- rbind(X2[i,], coef2[loc2[i], ])
    }
  }
  
  # Uniform, conditional ranks (R_i = Lambda(X_i'hat(beta)(y)))
  if(link=="probit"){
    r1 <- lapply(reg_coef_i1, function(x) pnorm(x[1,]%*%x[2,]))
    r1 <- do.call(rbind, r1)
    r2 <- lapply(reg_coef_i2, function(x) pnorm(x[1,]%*%x[2,]))
    r2 <- do.call(rbind, r2)
  }
  if(link=="logit"){
    r1 <- lapply(reg_coef_i1, function(x) exp(x[1,]%*%x[2,])/(1+exp(x[1,]%*%x[2,])) )
    r1 <- do.call(rbind, r1)
    r2 <- lapply(reg_coef_i2, function(x) exp(x[1,]%*%x[2,])/(1+exp(x[1,]%*%x[2,])) )
    r2 <- do.call(rbind, r2)
  }
  
  # Prepare ranks for output (before they are altered)
  if(rank_out==TRUE){
    data_ranks <- as.data.frame(cbind(r1,r2,ur1,ur2))
    colnames(data_ranks) <- c("r1", "r2", "ur1", "ur2")
  }
  
  # Normal Ranks
  nr1 <- qnorm(r1)
  nr2 <- qnorm(r2)
  nr1[nr1==Inf]   <- 1
  nr1[nr1==-Inf]  <- 0
  nr2[nr2==Inf]   <- 1
  nr2[nr2==-Inf]  <- 0
  
  if(restricted==TRUE | fully_restricted==TRUE){
    r1  <- r1-.5
    r2  <- r2-.5
    nr1 <- nr1-.5
    nr2 <- nr2-.5
    ur1 <- ur1-.5
    ur2 <- ur2-.5
  }
  
  # Estimation Slope Coefficient --------------------------------------------
  
  # Slope coefficient
  # Regression-based estimator
  if(cor_based==FALSE){
    
    # No subsamples for RR
    if(is.null(rr_subs)){
      
      if(reverse==FALSE){
        
        # First on second variable
        if(restricted==FALSE){
          b.coef.ur <- summary(lm(r1~r2))$coef[2,1]
          b.coef.rr <- summary(lm(ur1~ur2))$coef[2,1]
          
          a.coef.ur <- summary(lm(r1~r2))$coef[1,1]
          a.coef.rr <- summary(lm(ur1~ur2))$coef[1,1]
        }
        if(restricted==TRUE){
          b.coef.ur <- summary(lm(r1~0+r2))$coef[1]
          b.coef.rr <- summary(lm(ur1~0+ur2))$coef[1]
        }
        if(fully_restricted==TRUE){
          b.coef.ur <- 12*mean(r1*r2)
          b.coef.rr <- 12*mean(ur1*ur2)
        }
      }else{
        
        # Second on first variable
        if(restricted==FALSE){
          b.coef.ur <- summary(lm(r2~r1))$coef[2,1]
          b.coef.rr <- summary(lm(ur2~ur1))$coef[2,1]
          
          a.coef.ur <- summary(lm(r2~r1))$coef[1,1]
          a.coef.rr <- summary(lm(ur2~ur1))$coef[1,1]
        }
        if(restricted==TRUE){
          b.coef.ur <- summary(lm(r2~0+r1))$coef[1]
          b.coef.rr <- summary(lm(ur2~0+ur1))$coef[1]
        }
      }
      
    }else{
      
      if(reverse==FALSE){
        
        # First on second variable
        if(restricted==FALSE){
          b.coef.ur <- summary(lm(r1[rr_subs]~r2[rr_subs]))$coef[2,1]
          b.coef.rr <- summary(lm(ur1[rr_subs]~ur2[rr_subs]))$coef[2,1]
          
          a.coef.ur <- summary(lm(r1[rr_subs]~r2[rr_subs]))$coef[1,1]
          a.coef.rr <- summary(lm(ur1[rr_subs]~ur2[rr_subs]))$coef[1,1]
        }
        if(restricted==TRUE){
          b.coef.ur <- summary(lm(r1[rr_subs]~0+r2[rr_subs]))$coef[1]
          b.coef.rr <- summary(lm(ur1[rr_subs]~0+ur2[rr_subs]))$coef[1]
        }
        if(fully_restricted==TRUE){
          b.coef.ur <- 12*mean(r1[rr_subs]*r2[rr_subs])
          b.coef.rr <- 12*mean(ur1[rr_subs]*ur2[rr_subs])
        }
      }else{
        
        # Second on first variable
        if(restricted==FALSE){
          b.coef.ur <- summary(lm(r2[rr_subs]~r1[rr_subs]))$coef[2,1]
          b.coef.rr <- summary(lm(ur2[rr_subs]~ur1[rr_subs]))$coef[2,1]
        }
        if(restricted==TRUE){
          b.coef.ur <- summary(lm(r2[rr_subs]~0+r1[rr_subs]))$coef[1]
          b.coef.rr <- summary(lm(ur2[rr_subs]~0+ur1[rr_subs]))$coef[1]
        }
      }
      
    }
  }else{
    
    if(restricted==FALSE){
      b.coef.ur <- sum((r1-mean(r1))*(r2-mean(r2)))/(sqrt(sum((r1-mean(r1))^2)*sum((r2-mean(r2))^2)))
      b.coef.rr <- sum((ur1-mean(ur1))*(ur2-mean(ur2)))/(sqrt(sum((ur1-mean(ur1))^2)*sum((ur2-mean(ur2))^2)))
    }
    if(restricted==TRUE){
      # .5 is already subtracted
      b.coef.ur <- sum((r1)*(r2))/(sqrt(sum((r1)^2)*sum((r2)^2)))
      b.coef.rr <- sum((ur1)*(ur2))/(sqrt(sum((ur1)^2)*sum((ur2)^2)))
    }
  }
  
  # Results -----------------------------------------------------------------
  if(rank_out==FALSE){
    results <- list(b.coef.ur, b.coef.rr, a.coef.ur, a.coef.rr)
    names(results)  <- c("b.coef.ur", "b.coef.rr", "a.coef.ur", "a.coef.rr")
  }
  if(rank_out==TRUE){
    results <- list(b.coef.ur, b.coef.nr, b.coef.rr, data_ranks)
    names(results)  <- c("b.coef.ur", "b.coef.nr", "b.coef.rr", "cond_ranks")
  }
  return(results)
}










inf.crrr        <- function(f1,f2,data,alpha,b.boot,quantiles=seq(from=.05, to=.95, by=.9/(20-1)),link="probit",parallel=TRUE,s.size=1,coef_interp=TRUE,rr_subs=NULL,
                            restricted=FALSE,fully_restricted=FALSE,cor_based=FALSE,reverse=FALSE,rank_out=FALSE,ranks_pe=NULL,ranks_bs=NULL){
  
  # parallel = FALSE & ranks != NULL is not implemented
  # ranks_pe or ranks_bs provided and rank_out = TRUE
  
  # Point Estimates CRRR
  if(is.null(ranks_pe)){
    fit_crrr <- crrr(f1, f2, data, quantiles, parallel=parallel, link=link, coef_interp=coef_interp, rr_subs=rr_subs, 
                     restricted=restricted, fully_restricted=fully_restricted, cor_based=cor_based, reverse=reverse, rank_out=rank_out)
  }else{
    fit_crrr <- crrr_from_ranks(ranks_pe, restricted=restricted, fully_restricted=fully_restricted, cor_based=cor_based, reverse=reverse, rr_subs=rr_subs)
  }
  
  # Bootstrap: Non-parametric bootstrap for CI
  if(parallel==FALSE){
    bs_coef <- lapply(seq(1:b.boot), subs.crrr, f1=f1, f2=f2, data=data, quantiles=quantiles, s.size=s.size, coef_interp=coef_interp, rr_subs=rr_subs,
                      restricted=restricted, fully_restricted=fully_restricted, cor_based=cor_based, reverse=reverse, rank_out=rank_out)
  }
  if(parallel==TRUE){
    n_clus  <- detectCores()
    cl      <- makeCluster(n_clus)
    clusterEvalQ(cl,{library(fastglm)})
    clusterExport(cl,c("subs.crrr", "crrr", "dr_na", "u_loc", "crrr_from_ranks"))
    clusterSetRNGStream(cl, iseed = 123456789)
    if(is.null(ranks_bs)){
      bs_coef <- clusterApply(cl, 1:b.boot, function(i,f1,f2,data,quantiles,link,s.size,coef_interp,rr_subs,restricted,fully_restricted,cor_based,reverse,rank_out)
        subs.crrr(i,f1,f2,data,quantiles,link,s.size,coef_interp,rr_subs,restricted,fully_restricted,cor_based,reverse,rank_out),
        f1=f1, f2=f2, data=data, quantiles=quantiles, link=link, s.size=s.size, coef_interp=coef_interp, rr_subs=rr_subs, 
        restricted=restricted, fully_restricted=fully_restricted, cor_based=cor_based, reverse=reverse, rank_out=rank_out)
    }else{
      bs_coef <- clusterApply(cl, ranks_bs, function(ranks,restricted,fully_restricted,cor_based,reverse,rr_subs)
        crrr_from_ranks(ranks, restricted, fully_restricted, cor_based, reverse,rr_subs),
        restricted=restricted, fully_restricted=fully_restricted, cor_based=cor_based, reverse=reverse, rr_subs=rr_subs)
    }
    stopCluster(cl)
    closeAllConnections()
  }
  if(rank_out==FALSE){
    if(is.null(ranks_bs)){
      bs_coef     <- do.call(rbind, bs_coef)
      bs_coef_ur  <- bs_coef[,1]
      bs_coef_rr  <- bs_coef[,2]
      bs_a_coef_ur  <- bs_coef[,3]
      bs_a_coef_rr  <- bs_coef[,4]
    }else{
      bs_coef_ur  <- unlist(lapply(bs_coef, `[[`, 1))
      bs_coef_rr  <- unlist(lapply(bs_coef, `[[`, 2))
      
      bs_a_coef_ur  <- unlist(lapply(bs_coef, `[[`, 3))
      bs_a_coef_rr  <- unlist(lapply(bs_coef, `[[`, 4))
    }
  }else{
    bs_coef_ur  <- unlist(lapply(bs_coef, `[[`, 1))
    bs_coef_rr  <- unlist(lapply(bs_coef, `[[`, 2))
    
    bs_a_coef_ur  <- unlist(lapply(bs_coef, `[[`, 3))
    bs_a_coef_rr  <- unlist(lapply(bs_coef, `[[`, 4))
    
  }
  
  # Uniform Rank Results
  delta_coef_ur  <- bs_coef_ur - fit_crrr$b.coef.ur
  se_coef_ur     <- IQR(delta_coef_ur,na.rm=T)/(qnorm(.75)-qnorm(.25))
  ts_coef_ur     <- abs(delta_coef_ur)/se_coef_ur
  crit_coef_ur   <- quantile(ts_coef_ur, 1-alpha, na.rm=T)
  l_ci_b_ur      <- fit_crrr$b.coef.ur - crit_coef_ur*se_coef_ur
  u_ci_b_ur      <- fit_crrr$b.coef.ur + crit_coef_ur*se_coef_ur
  ci_b_ur        <- c(l_ci_b_ur, u_ci_b_ur)
  
  delta_a_coef_ur  <- bs_a_coef_ur - fit_crrr$a.coef.ur
  se_a_coef_ur     <- IQR(delta_a_coef_ur,na.rm=T)/(qnorm(.75)-qnorm(.25))
  
  delta_p25_coef_ur <- (bs_a_coef_ur+25*bs_coef_ur)-(fit_crrr$a.coef.ur+25*fit_crrr$b.coef.ur)
  se_p25_coef_ur     <- IQR(delta_p25_coef_ur,na.rm=T)/(qnorm(.75)-qnorm(.25))
  
  # RR Results
  delta_coef_rr  <- bs_coef_rr - fit_crrr$b.coef.rr
  #se_coef_rr     <- sd(delta_coef_rr,na.rm=T)
  se_coef_rr     <- IQR(delta_coef_rr,na.rm=T)/(qnorm(.75)-qnorm(.25))
  ts_coef_rr     <- abs(delta_coef_rr)/se_coef_rr
  crit_coef_rr   <- quantile(ts_coef_rr, 1-alpha, na.rm=T)
  l_ci_b_rr      <- fit_crrr$b.coef.rr - crit_coef_rr*se_coef_rr
  u_ci_b_rr      <- fit_crrr$b.coef.rr + crit_coef_rr*se_coef_rr
  ci_b_rr        <- c(l_ci_b_rr, u_ci_b_rr)
  
  delta_a_coef_rr  <- bs_a_coef_rr - fit_crrr$a.coef.rr
  se_a_coef_rr     <- IQR(delta_a_coef_rr,na.rm=T)/(qnorm(.75)-qnorm(.25))
  
  # Results -----------------------------------------------------------------
  if(rank_out==FALSE){
    results <- list(fit_crrr$b.coef.ur, se_coef_ur, (fit_crrr$a.coef.ur+25*fit_crrr$b.coef.ur), se_p25_coef_ur, 
                    fit_crrr$b.coef.rr, se_coef_rr, 
                    
                    fit_crrr$a.coef.ur, se_a_coef_ur,
                    fit_crrr$a.coef.rr, se_a_coef_rr
    )
    names(results)  <- c("b.coef.ur", "se.b.coef.ur", "p25_b", "p25_se",
                         "b.coef.rr", "se.b.coef.rr", 
                         "a.coef.ur", "se.a.coef.ur",
                         "a.coef.rr", "se.a.coef.rr")
  }
  if(rank_out==TRUE){
    results <- list(fit_crrr$b.coef.ur, se_coef_ur, ci_b_ur,
                    fit_crrr$b.coef.nr, se_coef_nr, ci_b_nr, 
                    fit_crrr$b.coef.rr, se_coef_rr, ci_b_rr,fit_crrr$cond_ranks, bs_ranks)
    names(results)  <- c("b.coef.ur", "se.coef.ur", "ci.bcoef.ur",
                         "b.coef.nr", "se.coef.nr", "ci.bcoef.nr", 
                         "b.coef.rr", "se.coef.rr", "ci.bcoef.rr", "cond_ranks_pe", "cond_ranks_bs")
  }
  return(results)
}

# Second-level functions --------------------------------------------------

dr_na             <- function(t,y,reg,link,method=2){
  yt <- as.numeric(y <= t)
  
  # Fast version: Cannot handle NAs
  fit <- tryCatch(fastglm(reg, yt, family = binomial(link = link), method=method)$coef,  error=function(e) NULL)
  
  # Handling cases with multicollinearity
  if(is.null(fit)){
    fit <- glm(yt ~ -1 + reg, family = binomial(link = link))$coef
    fit[is.na(fit)] <- 0
  }
  if(!is.null(fit)){return(fit)}
}
u_loc             <- function(y,q){
  which.min(abs(y-q))
}
subs.crrr         <- function(i,f1,f2,data,quantiles=seq(from=.05, to=.95, by=.9/(20-1)),link="probit",s.size=1,coef_interp=TRUE,rr_subs=NULL,
                              restricted=FALSE,fully_restricted=TRUE,cor_based=FALSE,reverse=FALSE,rank_out=FALSE){
  
  # Data
  rowid       <- seq(1:nrow(data))
  sample      <- sample(rowid, size=length(rowid)*s.size, replace=T)
  data_sample <- data[sample,]
  
  # Estimation
  fit_crrr    <- crrr(f1, f2, data_sample, quantiles, link, parallel=FALSE, coef_interp=coef_interp, rr_subs=rr_subs,
                      restricted=restricted, fully_restricted=fully_restricted, cor_based=cor_based, reverse=reverse, rank_out=rank_out)
  
  if(rank_out==FALSE){
    results <- c(fit_crrr$b.coef.ur, fit_crrr$b.coef.rr, fit_crrr$a.coef.ur, fit_crrr$a.coef.rr)
  }
  if(rank_out==TRUE){
    results <- list(fit_crrr$b.coef.ur, fit_crrr$b.coef.nr, fit_crrr$b.coef.rr, fit_crrr$cond_ranks)
  }
  return(results)
}
crrr_from_ranks   <- function(ranks, restricted=FALSE, fully_restricted=FALSE, cor_based=FALSE, reverse=FALSE, rr_subs=NULL){
  
  # Unconditional ranks
  ur1 <- ranks[,"ur1"]
  ur2 <- ranks[,"ur2"]
  
  # Conditional uniform ranks
  r1 <- ranks[,"r1"]
  r2 <- ranks[,"r2"]
  
  # Normal Ranks
  nr1 <- qnorm(r1)
  nr2 <- qnorm(r2)
  nr1[nr1==Inf]   <- 1
  nr1[nr1==-Inf]  <- 0
  nr2[nr2==Inf]   <- 1
  nr2[nr2==-Inf]  <- 0
  
  if(restricted==TRUE | fully_restricted==TRUE){
    r1  <- r1-.5
    r2  <- r2-.5
    nr1 <- nr1-.5
    nr2 <- nr2-.5
    ur1 <- ur1-.5
    ur2 <- ur2-.5
  }
  
  # Slope coefficient
  # Regression-based estimator
  if(cor_based==FALSE){
    
    # No subsamples for RR
    if(is.null(rr_subs)){
      
      if(reverse==FALSE){
        
        # First on second variable
        if(restricted==FALSE){
          b.coef.ur <- summary(lm(r1~r2))$coef[2,1]
          b.coef.rr <- summary(lm(ur1~ur2))$coef[2,1]
        }
        if(restricted==TRUE){
          b.coef.ur <- summary(lm(r1~0+r2))$coef[1]
          b.coef.rr <- summary(lm(ur1~0+ur2))$coef[1]
        }
        if(fully_restricted==TRUE){
          b.coef.ur <- 12*mean(r1*r2)
          b.coef.rr <- 12*mean(ur1*ur2)
        }
      }else{
        
        # Second on first variable
        if(restricted==FALSE){
          b.coef.ur <- summary(lm(r2~r1))$coef[2,1]
          b.coef.rr <- summary(lm(ur2~ur1))$coef[2,1]
        }
        if(restricted==TRUE){
          b.coef.ur <- summary(lm(r2~0+r1))$coef[1]
          b.coef.rr <- summary(lm(ur2~0+ur1))$coef[1]
        }
      }
      
    }else{
      
      if(reverse==FALSE){
        
        # First on second variable
        if(restricted==FALSE){
          b.coef.ur <- summary(lm(r1[rr_subs]~r2[rr_subs]))$coef[2,1]
          b.coef.rr <- summary(lm(ur1[rr_subs]~ur2[rr_subs]))$coef[2,1]
        }
        if(restricted==TRUE){
          b.coef.ur <- summary(lm(r1[rr_subs]~0+r2[rr_subs]))$coef[1]
          b.coef.rr <- summary(lm(ur1[rr_subs]~0+ur2[rr_subs]))$coef[1]
        }
        if(fully_restricted==TRUE){
          b.coef.ur <- 12*mean(r1[rr_subs]*r2[rr_subs])
          b.coef.rr <- 12*mean(ur1[rr_subs]*ur2[rr_subs])
        }
      }else{
        
        # Second on first variable
        if(restricted==FALSE){
          b.coef.ur <- summary(lm(r2[rr_subs]~r1[rr_subs]))$coef[2,1]
          b.coef.rr <- summary(lm(ur2[rr_subs]~ur1[rr_subs]))$coef[2,1]
        }
        if(restricted==TRUE){
          b.coef.ur <- summary(lm(r2[rr_subs]~0+r1[rr_subs]))$coef[1]
          b.coef.rr <- summary(lm(ur2[rr_subs]~0+ur1[rr_subs]))$coef[1]
        }
      }
      
    }
  }else{
    
    # No subsamples for RR
    if(is.null(rr_subs)){
      
      if(restricted==FALSE){
        b.coef.ur <- sum((r1-mean(r1))*(r2-mean(r2)))/(sqrt(sum((r1-mean(r1))^2)*sum((r2-mean(r2))^2)))
        b.coef.rr <- sum((ur1-mean(ur1))*(ur2-mean(ur2)))/(sqrt(sum((ur1-mean(ur1))^2)*sum((ur2-mean(ur2))^2)))
      }
      if(restricted==TRUE){
        # .5 is already subtracted
        b.coef.ur <- sum((r1)*(r2))/(sqrt(sum((r1)^2)*sum((r2)^2)))
        b.coef.rr <- sum((ur1)*(ur2))/(sqrt(sum((ur1)^2)*sum((ur2)^2)))
      }
    }else{
      
      if(restricted==FALSE){
        b.coef.ur <- sum((r1[rr_subs]-mean(r1[rr_subs]))*(r2[rr_subs]-mean(r2[rr_subs])))/(sqrt(sum((r1[rr_subs]-mean(r1[rr_subs]))^2)*sum((r2[rr_subs]-mean(r2[rr_subs]))^2)))
        b.coef.rr <- sum((ur1[rr_subs]-mean(ur1[rr_subs]))*(ur2[rr_subs]-mean(ur2[rr_subs])))/(sqrt(sum((ur1[rr_subs]-mean(ur1[rr_subs]))^2)*sum((ur2[rr_subs]-mean(ur2[rr_subs]))^2)))
      }
      if(restricted==TRUE){
        # .5 is already subtracted
        b.coef.ur <- sum((r1[rr_subs])*(r2[rr_subs]))/(sqrt(sum((r1[rr_subs])^2)*sum((r2[rr_subs])^2)))
        b.coef.rr <- sum((ur1[rr_subs])*(ur2[rr_subs]))/(sqrt(sum((ur1[rr_subs])^2)*sum((ur2[rr_subs])^2)))
      }
    }
  }
  
  results <- list(b.coef.ur, b.coef.nr, b.coef.rr)
  names(results)  <- c("b.coef.ur", "b.coef.nr", "b.coef.rr")
  return(results)
}
binorm_i          <- function(data){
  mvrnorm(n=1, mu=c(data[1], data[2]), Sigma=matrix(c(1, data[3], data[3], 1), ncol=2))
}
get_example_data  <- function(obs, nreg, cor, b1, b2, reg_type="binary", parallel=TRUE, seed=12345){
  
  # Set seed
  set.seed(seed)
  
  # Regressors
  if(reg_type=="binary"){
    x <- matrix(rbinom(obs*nreg, 1, .5), obs, nreg)
  }
  if(reg_type=="normal"){
    x <- matrix(rnorm(obs*nreg), obs, nreg)
  }
  colnames(x) <- paste("x", seq(1:(nreg)), sep="")
  
  # Observation-specific means to draw data from
  m1 <- x%*%b1
  m2 <- x%*%b2
  
  # Outcomes
  m1m2cor <- cbind(m1,m2,cor)
  if(parallel==FALSE){
    y1y2 <- t(apply(m1m2cor, 1, binorm_i))
  }else{
    n_clus  <- detectCores()
    cl      <- makeCluster(n_clus)
    clusterSetRNGStream(cl, iseed = seed)
    clusterEvalQ(cl,{library(MASS)})
    y1y2 <- t(parApply(cl, m1m2cor, 1, binorm_i))
    stopCluster(cl)
    closeAllConnections()
  }
  colnames(y1y2) <- c("y1", "y2")
  data <- as.data.frame(cbind(y1y2,x))
  return(data)
}

###############################################
# Part 1: refugee status as the only covariate
data <- fread("H:/Zheng_10223/Joint/cohort_2025.csv")
#f1=parent
#f2=child

datatest=data.frame(data[,c("MainParent_Income_HH_MainParentAge45_49_pctparent","refugeein","Child_Income_IND_30_34_pct","WORLD_AREA_BIRTH","SchoolingBins_Main","IntendedOccupation_Main",
                            "AnyEnglish_Main","AnyFrench_Main","IntendedOccupation_Main","LandingageMain","married")])



f1_refugeestat=as.formula(MainParent_Income_HH_MainParentAge45_49_pctparent ~ refugeein)
f2_refugeestat=as.formula(Child_Income_IND_30_34_pct ~ refugeein)



crrr_overall_inf <- inf.crrr(f1_refugeestat, f2_refugeestat, data=datatest, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)

#saveRDS(crrr_overall,"H:/Zheng_10223/ToVet/Output/crrr_overall.RData")
saveRDS(crrr_overall_inf,"H:/Zheng_10223/ToVet/Output/crrr_overall_inf.RData")


write.csv(data.frame(crrr_overall_inf),"H:/Zheng_10223/ToVet/Output/crrr_overall_inf.csv")



##############################
# Split into refuge and non refugee 


# make factors
datatest$WORLD_AREA_BIRTH=factor(datatest$WORLD_AREA_BIRTH)
datatest$SchoolingBins_Main=factor(datatest$SchoolingBins_Main)
datatest$IntendedOccupation_Main=factor(datatest$IntendedOccupation_Main)

datatest$englishfrench=ifelse(datatest$AnyEnglish_Main==1|datatest$AnyFrench_Main==1,1,0)

refugeedf=datatest[datatest$refugeein==1,]
nonrefugeedf=datatest[datatest$refugeein==0,]

############# Model 1: world Area
f1_refugeestat=as.formula(MainParent_Income_HH_MainParentAge45_49_pctparent ~ WORLD_AREA_BIRTH)
f2_refugeestat=as.formula(Child_Income_IND_30_34_pct ~ WORLD_AREA_BIRTH)

# Run the crrr function on the simulated data with multivariate X1 and X2



crrr_inf.refugee1 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=refugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)
crrr_inf.economic1 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=nonrefugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)

saveRDS(crrr_inf.refugee1,"H:/Zheng_10223/ToVet/Output/crrr_refugee1_inf.RData")

saveRDS(crrr_inf.economic1,"H:/Zheng_10223/ToVet/Output/crrr_economic1_inf.RData")


write.csv(data.frame(crrr_inf.refugee1),"H:/Zheng_10223/ToVet/Output/crrr_refugee1_inf.csv")

write.csv(data.frame(crrr_inf.economic1),"H:/Zheng_10223/ToVet/Output/crrr_economic1_inf.csv")


########### Model 2:  Education
f1_refugeestat=as.formula(MainParent_Income_HH_MainParentAge45_49_pctparent ~  SchoolingBins_Main)
f2_refugeestat=as.formula(Child_Income_IND_30_34_pct ~  SchoolingBins_Main)



crrr_inf.refugee2 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=refugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)
crrr_inf.economic2 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=nonrefugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)

saveRDS(crrr_inf.refugee2,"H:/Zheng_10223/ToVet/Output/crrr_refugee2_inf.RData")

saveRDS(crrr_inf.economic2,"H:/Zheng_10223/ToVet/Output/crrr_economic2_inf.RData")

write.csv(data.frame(crrr_inf.refugee2),"H:/Zheng_10223/ToVet/Output/crrr_refugee2_inf.csv")

write.csv(data.frame(crrr_inf.economic2),"H:/Zheng_10223/ToVet/Output/crrr_economic2_inf.csv")



########### Model 3:  Intended Occupation
f1_refugeestat=as.formula(MainParent_Income_HH_MainParentAge45_49_pctparent ~  IntendedOccupation_Main)
f2_refugeestat=as.formula(Child_Income_IND_30_34_pct ~  IntendedOccupation_Main)



crrr_inf.refugee3 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=refugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)
crrr_inf.economic3 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=nonrefugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)

#saveRDS(crrr_refugee3,"H:/Zheng_10223/ToVet/Output/crrr_refugee3.RData")
saveRDS(crrr_inf.refugee3,"H:/Zheng_10223/ToVet/Output/crrr_refugee3_inf.RData")

#saveRDS(crrr_economic3,"H:/Zheng_10223/ToVet/Output/crrr_economic3.RData")
saveRDS(crrr_inf.economic3,"H:/Zheng_10223/ToVet/Output/crrr_economic3_inf.RData")


write.csv(data.frame(crrr_inf.refugee3),"H:/Zheng_10223/ToVet/Output/crrr_refugee3_inf.csv")

write.csv(data.frame(crrr_inf.economic3),"H:/Zheng_10223/ToVet/Output/crrr_economic3_inf.csv")


########### Model 4:  language
f1_refugeestat=as.formula(MainParent_Income_HH_MainParentAge45_49_pctparent ~  englishfrench)
f2_refugeestat=as.formula(Child_Income_IND_30_34_pct ~ englishfrench)

crrr_inf.refugee4 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=refugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)
crrr_inf.economic4 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=nonrefugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)

saveRDS(crrr_inf.refugee4,"H:/Zheng_10223/ToVet/Output/crrr_refugee4_inf.RData")

saveRDS(crrr_inf.economic4,"H:/Zheng_10223/ToVet/Output/crrr_economic4_inf.RData")

write.csv(data.frame(crrr_inf.refugee4),"H:/Zheng_10223/ToVet/Output/crrr_refugee4_inf.csv")

write.csv(data.frame(crrr_inf.economic4),"H:/Zheng_10223/ToVet/Output/crrr_economic4_inf.csv")


########### Model 5: landing age
f1_refugeestat=as.formula(MainParent_Income_HH_MainParentAge45_49_pctparent ~ LandingageMain)
f2_refugeestat=as.formula(Child_Income_IND_30_34_pct ~ LandingageMain)



crrr_inf.refugee5 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=refugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)
crrr_inf.economic5 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=nonrefugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)

saveRDS(crrr_inf.refugee5,"H:/Zheng_10223/ToVet/Output/crrr_refugee5_inf.RData")

saveRDS(crrr_inf.economic5,"H:/Zheng_10223/ToVet/Output/crrr_economic5_inf.RData")

write.csv(data.frame(crrr_inf.refugee5),"H:/Zheng_10223/ToVet/Output/crrr_refugee5_inf.csv")

write.csv(data.frame(crrr_inf.economic5),"H:/Zheng_10223/ToVet/Output/crrr_economic5_inf.csv")


########### Model 6: married
f1_refugeestat=as.formula(MainParent_Income_HH_MainParentAge45_49_pctparent ~ married)
f2_refugeestat=as.formula(Child_Income_IND_30_34_pct ~ married)



crrr_inf.refugee6 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=refugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)
crrr_inf.economic6 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=nonrefugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)

saveRDS(crrr_inf.refugee6,"H:/Zheng_10223/ToVet/Output/crrr_refugee6_inf.RData")

saveRDS(crrr_inf.economic6,"H:/Zheng_10223/ToVet/Output/crrr_economic6_inf.RData")

write.csv(data.frame(crrr_inf.refugee6),"H:/Zheng_10223/ToVet/Output/crrr_refugee6_inf.csv")

write.csv(data.frame(crrr_inf.economic6),"H:/Zheng_10223/ToVet/Output/crrr_economic6_inf.csv")


########### Model 7: all
f1_refugeestat=as.formula(MainParent_Income_HH_MainParentAge45_49_pctparent ~ WORLD_AREA_BIRTH + SchoolingBins_Main + englishfrench + IntendedOccupation_Main + married + LandingageMain)
f2_refugeestat=as.formula(Child_Income_IND_30_34_pct ~ WORLD_AREA_BIRTH + SchoolingBins_Main  + englishfrench + IntendedOccupation_Main + married + LandingageMain)



crrr_inf.refugee7 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=refugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)
crrr_inf.economic7 <- inf.crrr(f1_refugeestat, f2_refugeestat, data=nonrefugeedf, alpha=0.05, b.boot=10, link="logit", cor_based=FALSE, parallel=TRUE, rank_out=FALSE, reverse=TRUE)

saveRDS(crrr_inf.refugee7,"H:/Zheng_10223/ToVet/Output/crrr_refugee7_inf.RData")

saveRDS(crrr_inf.economic7,"H:/Zheng_10223/ToVet/Output/crrr_economic7_inf.RData")



write.csv(data.frame(crrr_inf.refugee7),"H:/Zheng_10223/ToVet/Output/crrr_refugee7_inf.csv")

write.csv(data.frame(crrr_inf.economic7),"H:/Zheng_10223/ToVet/Output/crrr_economic7_inf.csv")
